Attention Mechanism Comparison¶
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from transformers import (
AutoTokenizer, AutoModel,
pipeline, AutoConfig
)
from datasets import load_dataset
from bertviz import model_view, head_view, neuron_view
from bertviz.transformers_neuron_view import BertModel
# Global plotting style for all matplotlib/seaborn figures in this notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Fix random seeds so sampling and any stochastic ops are reproducible.
torch.manual_seed(42)
np.random.seed(42)
# Prefer GPU when available; every loaded model is moved to this device later.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
Using device: cpu
import html
import re
def load_sentences(n_samples=100, min_words=10, max_words=20):
    """Load sentences from AG News dataset"""
    dataset = load_dataset("ag_news", split="train")
    label_names = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
    sentences, domains = [], []
    # Balanced sampling: an equal quota of sentences per news category.
    quota = n_samples // 4
    for label, label_name in label_names.items():
        pool = dataset.filter(lambda row: row['label'] == label).shuffle(seed=42)
        taken = 0
        for row in pool:
            if taken >= quota:
                break
            # Decode HTML entities, drop tags, and collapse runs of whitespace.
            clean = html.unescape(row['text'])
            clean = re.sub(r'<[^>]+>', '', clean)
            clean = re.sub(r'\s+', ' ', clean).strip()
            # Keep only sentences inside the requested word-count window.
            if min_words <= len(clean.split()) <= max_words and len(clean) > 20:
                sentences.append(clean)
                domains.append(label_name)
                taken += 1
    print(f"\n Loaded {len(sentences)} sentences from AG News:")
    for label_name in label_names.values():
        count = domains.count(label_name)
        print(f" {label_name}: {count} sentences")
    print(f"\n Sample sentences:")
    for i in range(min(3, len(sentences))):
        print(f" [{domains[i]}] {sentences[i][:80]}...")
    return sentences, domains
TEST_SENTENCES, SENTENCE_DOMAINS = load_sentences(n_samples=100, min_words=10, max_words=20)
Loaded 100 sentences from AG News: World: 25 sentences Sports: 25 sentences Business: 25 sentences Sci/Tech: 25 sentences Sample sentences: [World] Somalis vie to be new president Twenty-eight candidates are approved to contest ... [World] Agency pleads for hostage release Care International appeals on Arabic televisio... [World] Clinton recovering after heart op Former US President Bill Clinton's heart bypas...
# Registry of the transformer checkpoints being compared.
# "name" is the Hugging Face hub id, "description" a short label for reports,
# and "color" the fixed plotting color for that model.
MODELS = {
    "BERT": {
        "name": "bert-base-uncased",
        "description": "Original BERT",
        "color": "#FF6B6B"
    },
    "DistilBERT": {
        "name": "distilbert-base-uncased",
        "description": "Compressed BERT",
        "color": "#4ECDC4"
    },
    "RoBERTa": {
        "name": "roberta-base",
        "description": "Optimized BERT",
        "color": "#45B7D1"
    }
}
def load_model_and_tokenizer(model_name):
    """Fetch the tokenizer, an attention-enabled model (moved to `device`),
    and the configuration for the given checkpoint name."""
    config = AutoConfig.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # output_attentions=True makes forward passes return per-layer attentions.
    model = AutoModel.from_pretrained(
        model_name,
        output_attentions=True,
        return_dict=True,
    ).to(device)
    print(f"Modèle {model_name} loaded")
    return model, tokenizer, config
# Instantiate every registered checkpoint once and cache all pieces by key.
models_data = {}
for key, info in MODELS.items():
    mdl, tok, cfg = load_model_and_tokenizer(info["name"])
    models_data[key] = {
        "model": mdl,
        "tokenizer": tok,
        "config": cfg,
        "description": info["description"],
        "color": info["color"],
    }
print("✅ All models loaded")
Modèle bert-base-uncased loaded Modèle distilbert-base-uncased loaded
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Modèle roberta-base loaded ✅ All models loaded
def analyze_model_architecture(models_data):
    """Compare architecture of different models."""
    rows = []
    for name, entry in models_data.items():
        cfg = entry["config"]
        # Materialize the parameter list once instead of iterating twice.
        params = list(entry["model"].parameters())
        n_total = sum(p.numel() for p in params)
        n_trainable = sum(p.numel() for p in params if p.requires_grad)
        rows.append({
            "Model": name,
            "Layers": cfg.num_hidden_layers,
            "Hidden Size": cfg.hidden_size,
            "Attention Heads": cfg.num_attention_heads,
            "Vocab Size": cfg.vocab_size,
            "Total Params": f"{n_total:,}",
            "Trainable Params": f"{n_trainable:,}",
        })
        print(f"\n{name}:")
        print(f" Layers: {cfg.num_hidden_layers}")
        print(f" Hidden Size: {cfg.hidden_size}")
        print(f" Attention Heads: {cfg.num_attention_heads}")
        print(f" Vocab Size: {cfg.vocab_size:,}")
        print(f" Parameters: {n_total:,}")
    return pd.DataFrame(rows)
# Run the architecture comparison and render the summary table.
# NOTE: display() is supplied by the IPython notebook environment.
architecture_df = analyze_model_architecture(models_data)
print("\n Comparison table:")
display(architecture_df)
BERT: Layers: 12 Hidden Size: 768 Attention Heads: 12 Vocab Size: 30,522 Parameters: 109,482,240 DistilBERT: Layers: 6 Hidden Size: 768 Attention Heads: 12 Vocab Size: 30,522 Parameters: 66,362,880 RoBERTa: Layers: 12 Hidden Size: 768 Attention Heads: 12 Vocab Size: 50,265 Parameters: 124,645,632 Comparison table:
| | Model | Layers | Hidden Size | Attention Heads | Vocab Size | Total Params | Trainable Params |
|---|---|---|---|---|---|---|---|
| 0 | BERT | 12 | 768 | 12 | 30522 | 109,482,240 | 109,482,240 |
| 1 | DistilBERT | 6 | 768 | 12 | 30522 | 66,362,880 | 66,362,880 |
| 2 | RoBERTa | 12 | 768 | 12 | 50265 | 124,645,632 | 124,645,632 |
def analyze_sentence_attention(sentence, model_name, models_data):
    """Analyze attention patterns for a given sentence and model.

    Tokenizes `sentence`, runs the model with attentions enabled, and returns
    a dict with the tokens, the tuple of per-layer attention tensors, the
    (device-resident) model inputs, and the layer/head counts.
    """
    print(f"Analyze attention for: '{sentence}'")
    print(f"Model: {model_name}")
    tokenizer = models_data[model_name]["tokenizer"]
    model = models_data[model_name]["model"]
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    # BUG FIX: the model was moved to `device` at load time but the tokenized
    # inputs stayed on CPU, raising a device-mismatch error whenever CUDA is
    # available. Move them, mirroring create_attention_visualizations.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    print(f"Tokens ({len(tokens)}): {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
    # Prediction with attention
    with torch.no_grad():
        outputs = model(**inputs)
    attentions = outputs.attentions
    num_layers = len(attentions)
    num_heads = attentions[0].shape[1]
    seq_length = attentions[0].shape[-1]
    print(f"Attention shape: {num_layers} layers, {num_heads} heads, {seq_length} tokens")
    return {
        "tokens": tokens,
        "attentions": attentions,
        "inputs": inputs,
        "num_layers": num_layers,
        "num_heads": num_heads
    }
# Run the per-sentence attention analysis for every loaded model on one sample.
test_sentence = TEST_SENTENCES[1]
print(f" Test on: '{test_sentence}'\n")
attention_results = {}
for name in models_data:
    attention_results[name] = analyze_sentence_attention(test_sentence, name, models_data)
    print("-" * 50)
Test on: 'Agency pleads for hostage release Care International appeals on Arabic television for the release of its Iraq director, Margaret Hassan.' Analyze attention for: 'Agency pleads for hostage release Care International appeals on Arabic television for the release of its Iraq director, Margaret Hassan.' Model: BERT Tokens (25): ['[CLS]', 'agency', 'plead', '##s', 'for', 'hostage', 'release', 'care', 'international', 'appeals']... Attention shape: 12 layers, 12 heads, 25 tokens -------------------------------------------------- Analyze attention for: 'Agency pleads for hostage release Care International appeals on Arabic television for the release of its Iraq director, Margaret Hassan.' Model: DistilBERT Tokens (25): ['[CLS]', 'agency', 'plead', '##s', 'for', 'hostage', 'release', 'care', 'international', 'appeals']... Attention shape: 6 layers, 12 heads, 25 tokens -------------------------------------------------- Analyze attention for: 'Agency pleads for hostage release Care International appeals on Arabic television for the release of its Iraq director, Margaret Hassan.' Model: RoBERTa Tokens (26): ['<s>', 'A', 'gency', 'Ġple', 'ads', 'Ġfor', 'Ġhostage', 'Ġrelease', 'ĠCare', 'ĠInternational']... Attention shape: 12 layers, 12 heads, 26 tokens --------------------------------------------------
from IPython.display import display, HTML
def create_attention_visualizations(sentence, model_name, models_data):
    """Create attention visualizations with BertViz"""
    entry = models_data[model_name]
    tokenizer = entry["tokenizer"]
    model = entry["model"]
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
    tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
    # Keep inputs on the same device as the model before the forward pass.
    batch = {k: v.to(device) for k, v in encoded.items()}
    with torch.no_grad():
        attentions = model(**batch).attentions
    print(f"Tokens: {len(tokens)} tokens")
    print(f"Attention: {len(attentions)} layers, {attentions[0].shape[1]} heads")
    print("Model View (all attention heads):")
    model_view(attentions, tokens)
    print("Head View (per head detail):")
    head_view(attentions, tokens)
    return {
        "tokens": tokens,
        "attentions": attentions,
        "num_layers": len(attentions),
        "num_heads": attentions[0].shape[1]
    }
# Generate BertViz visualizations for every model on the same test sentence.
visualization_results = {}
banner = "=" * 60
for name in models_data:
    print(f"\n{banner}")
    print(f"VISUALIZING {name}")
    print(f"{banner}")
    visualization_results[name] = create_attention_visualizations(test_sentence, name, models_data)
============================================================ VISUALIZING BERT ============================================================ Tokens: 25 tokens Attention: 12 layers, 12 heads Model View (all attention heads):
Head View (per head detail):
============================================================ VISUALIZING DistilBERT ============================================================ Tokens: 25 tokens Attention: 6 layers, 12 heads Model View (all attention heads):
Head View (per head detail):
============================================================ VISUALIZING RoBERTa ============================================================ Tokens: 26 tokens Attention: 12 layers, 12 heads Model View (all attention heads):
Head View (per head detail):
Understanding Attention Patterns¶
What are Layers and Heads?¶
Layers are like steps in processing - each layer refines the understanding:
- Early layers (0-3): Focus on grammar and word relationships
- Middle layers (4-8): Build meaning and context
- Final layers (9-11): Aggregate information for the final output
Attention Heads decide which words are important. Think of it like each word "looking at" other words to understand context. With 12 heads per layer, the model examines different aspects simultaneously.
Reading the Visualizations¶
Model View: A grid showing all attention heads across all layers at once. Each small matrix is the attention pattern for one head in one layer — useful for spotting where in the network attention is active.
Head View: Shows, for a selected layer and head, which words pay attention to which. Brighter/thicker connections = stronger attention.
Comparing the Models¶
BERT - Baseline with hierarchical patterns from syntax to semantics
DistilBERT - Compressed to 6 layers but maintains effectiveness with more focused patterns
RoBERTa - Optimized training leads to cleaner, more targeted attention patterns
def analyze_attention_patterns(sentence, models_data):
    """Analyze attention patterns for different linguistic phenomena.

    For each model, computes last-layer statistics: attention to the first
    (classification) token, mean self-attention, the ratio of attention to
    content vs. function words, and a global attention entropy. Returns a
    dict mapping model name -> dict of float metrics.
    """
    print(f"ANALYSE PATTERNS: '{sentence}'")
    print("=" * 60)
    patterns_analysis = {}
    for model_name, data in models_data.items():
        print(f"\n{model_name}:")
        tokenizer = data["tokenizer"]
        model = data["model"]
        inputs = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=False)
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        with torch.no_grad():
            outputs = model(**inputs)
        attentions = outputs.attentions
        last_attention = attentions[-1][0]  # [heads, seq_len, seq_len]
        # 1. Attention to the first (CLS-style) token, averaged over heads
        cls_attention = last_attention[:, 0, :].mean(dim=0)
        # 2. Self-attention (attention of a token to itself)
        self_attention = torch.diagonal(last_attention.mean(dim=0))
        # 3. Attention to content words vs function words
        content_words = []
        function_words = []
        # Cover both WordPiece ([CLS]/[SEP]) and RoBERTa (<s>/</s>) specials.
        special_tokens = {'[CLS]', '[SEP]', '[PAD]', '<s>', '</s>', '<pad>'}
        for i, token in enumerate(tokens):
            # BUG FIX: BPE/SentencePiece tokens carry a leading 'Ġ'/'▁' marker
            # (e.g. RoBERTa's 'Ġthe'), so raw tokens never matched the
            # function-word list and RoBERTa's content/function ratio was
            # inflated by the 0.001 clamp. Normalize before classifying.
            word = token.lstrip('Ġ▁').lower()
            if token.startswith('##') or token in special_tokens:
                continue
            elif word in ['the', 'a', 'an', 'is', 'are', 'was', 'were', 'of', 'in', 'on', 'at']:
                function_words.append(i)
            else:
                content_words.append(i)
        avg_attention_to_content = last_attention.mean(dim=0)[:, content_words].mean() if content_words else 0
        avg_attention_to_function = last_attention.mean(dim=0)[:, function_words].mean() if function_words else 0
        # NOTE(review): entropy is summed over the whole averaged matrix, not
        # normalized per row — fine for relative comparison between models.
        patterns = {
            "cls_attention_max": float(cls_attention.max()),
            "cls_attention_mean": float(cls_attention.mean()),
            "self_attention_mean": float(self_attention.mean()),
            "content_vs_function": float(avg_attention_to_content / max(avg_attention_to_function, 0.001)),
            "attention_entropy": float(-torch.sum(last_attention.mean(dim=0) * torch.log(last_attention.mean(dim=0) + 1e-10)).mean())
        }
        patterns_analysis[model_name] = patterns
        print(f" CLS attention max: {patterns['cls_attention_max']:.3f}")
        print(f" Self-attention mean: {patterns['self_attention_mean']:.3f}")
        print(f" Content/Function ratio: {patterns['content_vs_function']:.2f}")
        print(f" Attention entropy: {patterns['attention_entropy']:.3f}")
    return patterns_analysis
# Sweep the pattern metrics over the entire test corpus, one sentence at a time.
all_patterns = {}
for idx, sent in enumerate(TEST_SENTENCES, start=1):
    print(f"\n{'='*20} SENTENCE {idx} {'='*20}")
    all_patterns[f"Sentence_{idx}"] = analyze_attention_patterns(sent, models_data)
==================== SENTENCE 1 ==================== ANALYSE PATTERNS: 'Somalis vie to be new president Twenty-eight candidates are approved to contest next week's elections to be Somalia's new leader.' ============================================================ BERT: CLS attention max: 0.108 Self-attention mean: 0.088 Content/Function ratio: 1.35 Attention entropy: 61.730 DistilBERT: CLS attention max: 0.171 Self-attention mean: 0.070 Content/Function ratio: 0.89 Attention entropy: 62.514 RoBERTa: CLS attention max: 0.281 Self-attention mean: 0.128 Content/Function ratio: 3.81 Attention entropy: 60.199 ==================== SENTENCE 2 ==================== ANALYSE PATTERNS: 'Agency pleads for hostage release Care International appeals on Arabic television for the release of its Iraq director, Margaret Hassan.' ============================================================ BERT: CLS attention max: 0.102 Self-attention mean: 0.112 Content/Function ratio: 1.59 Attention entropy: 53.609 DistilBERT: CLS attention max: 0.196 Self-attention mean: 0.083 Content/Function ratio: 2.18 Attention entropy: 50.663 RoBERTa: CLS attention max: 0.270 Self-attention mean: 0.128 Content/Function ratio: 1.50 Attention entropy: 53.205 ==================== SENTENCE 3 ==================== ANALYSE PATTERNS: 'Clinton recovering after heart op Former US President Bill Clinton's heart bypass operation is successful, say doctors in New York.' 
============================================================ BERT: CLS attention max: 0.104 Self-attention mean: 0.116 Content/Function ratio: 0.76 Attention entropy: 49.653 DistilBERT: CLS attention max: 0.183 Self-attention mean: 0.085 Content/Function ratio: 1.00 Attention entropy: 54.519 RoBERTa: CLS attention max: 0.259 Self-attention mean: 0.121 Content/Function ratio: 40.00 Attention entropy: 51.660 ==================== SENTENCE 4 ==================== ANALYSE PATTERNS: 'Tired of post-9/11 hassles, Arab tourists head east Saudi visitors to Malaysia were up 53 percent in 2004.' ============================================================ BERT: CLS attention max: 0.107 Self-attention mean: 0.102 Content/Function ratio: 2.16 Attention entropy: 63.509 DistilBERT: CLS attention max: 0.124 Self-attention mean: 0.078 Content/Function ratio: 2.13 Attention entropy: 62.896 RoBERTa: CLS attention max: 0.260 Self-attention mean: 0.142 Content/Function ratio: 34.48 Attention entropy: 58.007 ==================== SENTENCE 5 ==================== ANALYSE PATTERNS: 'Greenspan warns over US deficit Federal Reserve chief Alan Greenspan says the US trade deficit cannot be sustained indefinitely.' ============================================================ BERT: CLS attention max: 0.199 Self-attention mean: 0.116 Content/Function ratio: 2.66 Attention entropy: 46.574 DistilBERT: CLS attention max: 0.241 Self-attention mean: 0.089 Content/Function ratio: 1.86 Attention entropy: 45.872 RoBERTa: CLS attention max: 0.281 Self-attention mean: 0.141 Content/Function ratio: 40.00 Attention entropy: 51.452 ==================== SENTENCE 6 ==================== ANALYSE PATTERNS: 'Landmine kills Afghan policemen Seven Afghan policemen are killed by a landmine in the Kandahar province, ahead of landmark elections.' 
============================================================ BERT: CLS attention max: 0.106 Self-attention mean: 0.097 Content/Function ratio: 2.36 Attention entropy: 62.710 DistilBERT: CLS attention max: 0.142 Self-attention mean: 0.085 Content/Function ratio: 2.64 Attention entropy: 63.065 RoBERTa: CLS attention max: 0.272 Self-attention mean: 0.115 Content/Function ratio: 37.04 Attention entropy: 55.344 ==================== SENTENCE 7 ==================== ANALYSE PATTERNS: 'Ghana votes in presidential poll Presidential elections are taking place in Ghana, which boasts political stability but much poverty.' ============================================================ BERT: CLS attention max: 0.176 Self-attention mean: 0.112 Content/Function ratio: 2.83 Attention entropy: 48.111 DistilBERT: CLS attention max: 0.135 Self-attention mean: 0.083 Content/Function ratio: 2.47 Attention entropy: 48.360 RoBERTa: CLS attention max: 0.267 Self-attention mean: 0.142 Content/Function ratio: 41.67 Attention entropy: 50.312 ==================== SENTENCE 8 ==================== ANALYSE PATTERNS: 'New Zimbabwe restrictions target aid groups A proposed law would limit foreign funding of churches and AIDS programs.' ============================================================ BERT: CLS attention max: 0.158 Self-attention mean: 0.129 Content/Function ratio: 1.61 Attention entropy: 41.358 DistilBERT: CLS attention max: 0.175 Self-attention mean: 0.088 Content/Function ratio: 1.97 Attention entropy: 40.642 RoBERTa: CLS attention max: 0.277 Self-attention mean: 0.118 Content/Function ratio: 47.62 Attention entropy: 42.179 ==================== SENTENCE 9 ==================== ANALYSE PATTERNS: 'German trial stirs torture debate An ex-police officer goes on trial in Germany charged with threatening a suspect with torture.' 
============================================================ BERT: CLS attention max: 0.119 Self-attention mean: 0.107 Content/Function ratio: 2.05 Attention entropy: 54.029 DistilBERT: CLS attention max: 0.193 Self-attention mean: 0.086 Content/Function ratio: 2.28 Attention entropy: 54.654 RoBERTa: CLS attention max: 0.277 Self-attention mean: 0.132 Content/Function ratio: 38.46 Attention entropy: 51.046 ==================== SENTENCE 10 ==================== ANALYSE PATTERNS: 'Painkiller risk to gut revealed The risk of intestinal damage from common painkillers may be higher than thought, research suggests.' ============================================================ BERT: CLS attention max: 0.144 Self-attention mean: 0.109 Content/Function ratio: 2.25 Attention entropy: 63.005 DistilBERT: CLS attention max: 0.176 Self-attention mean: 0.082 Content/Function ratio: 2.47 Attention entropy: 62.340 RoBERTa: CLS attention max: 0.263 Self-attention mean: 0.123 Content/Function ratio: 38.46 Attention entropy: 50.159 ==================== SENTENCE 11 ==================== ANALYSE PATTERNS: 'Fighting rages in South Ossetia Heavy fighting erupts in Georgia's breakaway South Ossetia region, shattering a two-day ceasefire.' ============================================================ BERT: CLS attention max: 0.145 Self-attention mean: 0.079 Content/Function ratio: 4.34 Attention entropy: 65.446 DistilBERT: CLS attention max: 0.150 Self-attention mean: 0.064 Content/Function ratio: 2.69 Attention entropy: 74.924 RoBERTa: CLS attention max: 0.267 Self-attention mean: 0.110 Content/Function ratio: 31.25 Attention entropy: 67.078 ==================== SENTENCE 12 ==================== ANALYSE PATTERNS: ''Few ready' for information act Public bodies are ill-prepared for the Freedom of Information Act, says a group of MPs.' 
============================================================ BERT: CLS attention max: 0.089 Self-attention mean: 0.102 Content/Function ratio: 2.80 Attention entropy: 56.225 DistilBERT: CLS attention max: 0.184 Self-attention mean: 0.089 Content/Function ratio: 1.95 Attention entropy: 58.090 RoBERTa: CLS attention max: 0.250 Self-attention mean: 0.146 Content/Function ratio: 34.48 Attention entropy: 60.050 ==================== SENTENCE 13 ==================== ANALYSE PATTERNS: 'Charges over Montenegro killing Montenegro prosecutors charge a former karate champion over the death of a newspaper editor.' ============================================================ BERT: CLS attention max: 0.115 Self-attention mean: 0.115 Content/Function ratio: 1.61 Attention entropy: 41.917 DistilBERT: CLS attention max: 0.211 Self-attention mean: 0.097 Content/Function ratio: 1.89 Attention entropy: 42.223 RoBERTa: CLS attention max: 0.299 Self-attention mean: 0.138 Content/Function ratio: 41.67 Attention entropy: 49.930 ==================== SENTENCE 14 ==================== ANALYSE PATTERNS: ''Distressed' Thatcher flies home Baroness Thatcher returns home as it emerges her son Sir Mark could face extradition proceedings.' ============================================================ BERT: CLS attention max: 0.166 Self-attention mean: 0.101 Content/Function ratio: 29.91 Attention entropy: 45.584 DistilBERT: CLS attention max: 0.223 Self-attention mean: 0.084 Content/Function ratio: 34.17 Attention entropy: 50.705 RoBERTa: CLS attention max: 0.276 Self-attention mean: 0.150 Content/Function ratio: 38.46 Attention entropy: 49.948 ==================== SENTENCE 15 ==================== ANALYSE PATTERNS: 'U.S. Marine Killed in Anbar Province U.S. 
Marine killed in Iraq's Anbar provinceBC-Iraq-Military Death,0115' ============================================================ BERT: CLS attention max: 0.093 Self-attention mean: 0.094 Content/Function ratio: 2.88 Attention entropy: 89.299 DistilBERT: CLS attention max: 0.115 Self-attention mean: 0.066 Content/Function ratio: 3.25 Attention entropy: 86.616 RoBERTa: CLS attention max: 0.289 Self-attention mean: 0.107 Content/Function ratio: 30.30 Attention entropy: 67.425 ==================== SENTENCE 16 ==================== ANALYSE PATTERNS: 'Alcohol hampers depth perception Drinking alcohol impairs driving ability by disrupting depth perception, researchers find.' ============================================================ BERT: CLS attention max: 0.118 Self-attention mean: 0.103 Content/Function ratio: 33.39 Attention entropy: 41.201 DistilBERT: CLS attention max: 0.229 Self-attention mean: 0.100 Content/Function ratio: 27.23 Attention entropy: 47.713 RoBERTa: CLS attention max: 0.292 Self-attention mean: 0.122 Content/Function ratio: 43.48 Attention entropy: 44.052 ==================== SENTENCE 17 ==================== ANALYSE PATTERNS: 'Table tennis: Gold for China Zhang Yining beats North Korea's Kim Hyang-Mi to win the table tennis women's singles.' ============================================================ BERT: CLS attention max: 0.132 Self-attention mean: 0.094 Content/Function ratio: 2.06 Attention entropy: 66.171 DistilBERT: CLS attention max: 0.155 Self-attention mean: 0.070 Content/Function ratio: 2.29 Attention entropy: 66.386 RoBERTa: CLS attention max: 0.269 Self-attention mean: 0.112 Content/Function ratio: 34.48 Attention entropy: 58.151 ==================== SENTENCE 18 ==================== ANALYSE PATTERNS: 'Swiss 'reject' citizenship reform Swiss voters appear to have rejected proposals to relax the country's strict naturalisation laws.' 
============================================================ BERT: CLS attention max: 0.151 Self-attention mean: 0.095 Content/Function ratio: 2.18 Attention entropy: 47.602 DistilBERT: CLS attention max: 0.201 Self-attention mean: 0.080 Content/Function ratio: 1.91 Attention entropy: 51.887 RoBERTa: CLS attention max: 0.276 Self-attention mean: 0.137 Content/Function ratio: 37.04 Attention entropy: 53.779 ==================== SENTENCE 19 ==================== ANALYSE PATTERNS: 'Maradona 'can be treated abroad' Former football star Diego Maradona can return to Cuba for drug rehabilitation, his lawyer says.' ============================================================ BERT: CLS attention max: 0.120 Self-attention mean: 0.092 Content/Function ratio: 25.88 Attention entropy: 61.333 DistilBERT: CLS attention max: 0.174 Self-attention mean: 0.083 Content/Function ratio: 30.65 Attention entropy: 64.968 RoBERTa: CLS attention max: 0.279 Self-attention mean: 0.127 Content/Function ratio: 33.33 Attention entropy: 62.869 ==================== SENTENCE 20 ==================== ANALYSE PATTERNS: 'Cambodia set to crown new king Cambodians prepare for the coronation of King Sihamoni, amid an array of official festivities.' ============================================================ BERT: CLS attention max: 0.106 Self-attention mean: 0.087 Content/Function ratio: 3.80 Attention entropy: 58.372 DistilBERT: CLS attention max: 0.141 Self-attention mean: 0.080 Content/Function ratio: 3.51 Attention entropy: 60.281 RoBERTa: CLS attention max: 0.266 Self-attention mean: 0.119 Content/Function ratio: 32.26 Attention entropy: 61.734 ==================== SENTENCE 21 ==================== ANALYSE PATTERNS: 'Observers approve Afghan election International observers say calls to annul the Afghan presidential poll on grounds of fraud are unjustified.' 
============================================================ BERT: CLS attention max: 0.122 Self-attention mean: 0.099 Content/Function ratio: 2.11 Attention entropy: 52.553 DistilBERT: CLS attention max: 0.205 Self-attention mean: 0.080 Content/Function ratio: 2.58 Attention entropy: 51.205 RoBERTa: CLS attention max: 0.275 Self-attention mean: 0.129 Content/Function ratio: 37.04 Attention entropy: 53.768 ==================== SENTENCE 22 ==================== ANALYSE PATTERNS: 'Burma crackdown on luxury cars Burma investigates illegally imported luxury cars, the latest repercussion of former PM Khin Nyunt's ouster.' ============================================================ BERT: CLS attention max: 0.129 Self-attention mean: 0.096 Content/Function ratio: 1.99 Attention entropy: 69.674 DistilBERT: CLS attention max: 0.213 Self-attention mean: 0.078 Content/Function ratio: 1.96 Attention entropy: 70.122 RoBERTa: CLS attention max: 0.265 Self-attention mean: 0.131 Content/Function ratio: 2.37 Attention entropy: 58.381 ==================== SENTENCE 23 ==================== ANALYSE PATTERNS: 'Turkish hostages 'killed' in Iraq An Iraqi militant group kills three Turkish hostages, reports Arabic TV station al-Jazeera.' ============================================================ BERT: CLS attention max: 0.098 Self-attention mean: 0.114 Content/Function ratio: 1.30 Attention entropy: 56.145 DistilBERT: CLS attention max: 0.175 Self-attention mean: 0.081 Content/Function ratio: 1.31 Attention entropy: 57.362 RoBERTa: CLS attention max: 0.282 Self-attention mean: 0.112 Content/Function ratio: 37.04 Attention entropy: 53.194 ==================== SENTENCE 24 ==================== ANALYSE PATTERNS: 'Home users get Windows update Microsoft is making its important security update for Windows XP available on auto-update servers today.' 
============================================================ BERT: CLS attention max: 0.132 Self-attention mean: 0.124 Content/Function ratio: 2.20 Attention entropy: 51.523 DistilBERT: CLS attention max: 0.221 Self-attention mean: 0.085 Content/Function ratio: 1.87 Attention entropy: 52.584 RoBERTa: CLS attention max: 0.259 Self-attention mean: 0.133 Content/Function ratio: 40.00 Attention entropy: 56.743 ==================== SENTENCE 25 ==================== ANALYSE PATTERNS: 'Chinese firm buys IBM PC business IBM is selling its PC hardware business to number one Chinese computer maker Lenovo.' ============================================================ BERT: CLS attention max: 0.146 Self-attention mean: 0.123 Content/Function ratio: 1.92 Attention entropy: 47.939 DistilBERT: CLS attention max: 0.215 Self-attention mean: 0.088 Content/Function ratio: 1.42 Attention entropy: 45.450 RoBERTa: CLS attention max: 0.299 Self-attention mean: 0.132 Content/Function ratio: 43.48 Attention entropy: 46.955 ==================== SENTENCE 26 ==================== ANALYSE PATTERNS: 'Today's schedule Pro basketball: WNBA playoffs: Sun vs. Washington (Game 3) at Mohegan Sun Arena, Uncasville, Conn., 8 p.m.' ============================================================ BERT: CLS attention max: 0.085 Self-attention mean: 0.091 Content/Function ratio: 2.85 Attention entropy: 102.113 DistilBERT: CLS attention max: 0.105 Self-attention mean: 0.062 Content/Function ratio: 3.46 Attention entropy: 105.811 RoBERTa: CLS attention max: 0.267 Self-attention mean: 0.116 Content/Function ratio: 26.32 Attention entropy: 83.911 ==================== SENTENCE 27 ==================== ANALYSE PATTERNS: 'NFL Pass-Interference Crackwon Draws Fire (AP) AP - Darren Sharper is upset about the NFL's crackdown on pass interference.' 
============================================================ BERT: CLS attention max: 0.146 Self-attention mean: 0.089 Content/Function ratio: 2.28 Attention entropy: 69.033 DistilBERT: CLS attention max: 0.181 Self-attention mean: 0.076 Content/Function ratio: 1.27 Attention entropy: 68.324 RoBERTa: CLS attention max: 0.241 Self-attention mean: 0.137 Content/Function ratio: 32.26 Attention entropy: 61.675 ==================== SENTENCE 28 ==================== ANALYSE PATTERNS: 'Transactions BASEBALL Arizona (NL): Signed a two-year player development agreement with Tennessee (Southern).' ============================================================ BERT: CLS attention max: 0.108 Self-attention mean: 0.112 Content/Function ratio: 3.05 Attention entropy: 46.106 DistilBERT: CLS attention max: 0.170 Self-attention mean: 0.091 Content/Function ratio: 1.98 Attention entropy: 52.840 RoBERTa: CLS attention max: 0.343 Self-attention mean: 0.171 Content/Function ratio: 43.48 Attention entropy: 46.249 ==================== SENTENCE 29 ==================== ANALYSE PATTERNS: 'Rams Roll Past Redskins 28-3 (AP) AP - Torry Holt and the St. Louis Rams finally had something to celebrate.' ============================================================ BERT: CLS attention max: 0.110 Self-attention mean: 0.089 Content/Function ratio: 3.63 Attention entropy: 59.903 DistilBERT: CLS attention max: 0.165 Self-attention mean: 0.081 Content/Function ratio: 1.52 Attention entropy: 66.351 RoBERTa: CLS attention max: 0.275 Self-attention mean: 0.138 Content/Function ratio: 33.33 Attention entropy: 58.415 ==================== SENTENCE 30 ==================== ANALYSE PATTERNS: 'Quincy gets its revenge It took a year, but Quincy's volleyball team has bragging rights in the city again.' 
============================================================ BERT: CLS attention max: 0.116 Self-attention mean: 0.099 Content/Function ratio: 3.37 Attention entropy: 57.322 DistilBERT: CLS attention max: 0.161 Self-attention mean: 0.087 Content/Function ratio: 2.74 Attention entropy: 61.465 RoBERTa: CLS attention max: 0.278 Self-attention mean: 0.144 Content/Function ratio: 2.06 Attention entropy: 52.182 ==================== SENTENCE 31 ==================== ANALYSE PATTERNS: 'Macey surges into fourth Britain's Dean Macey lies in fourth place after five events of the decathlon.' ============================================================ BERT: CLS attention max: 0.118 Self-attention mean: 0.114 Content/Function ratio: 3.06 Attention entropy: 51.432 DistilBERT: CLS attention max: 0.228 Self-attention mean: 0.090 Content/Function ratio: 2.68 Attention entropy: 53.708 RoBERTa: CLS attention max: 0.275 Self-attention mean: 0.148 Content/Function ratio: 41.67 Attention entropy: 44.847 ==================== SENTENCE 32 ==================== ANALYSE PATTERNS: 'Ruffin Grabs Attention Forward Michael Ruffin is impressing Wizards coaches with his rebounding and rough play under the basket.' ============================================================ BERT: CLS attention max: 0.137 Self-attention mean: 0.109 Content/Function ratio: 1.98 Attention entropy: 48.097 DistilBERT: CLS attention max: 0.224 Self-attention mean: 0.092 Content/Function ratio: 1.33 Attention entropy: 56.100 RoBERTa: CLS attention max: 0.259 Self-attention mean: 0.137 Content/Function ratio: 2.26 Attention entropy: 55.430 ==================== SENTENCE 33 ==================== ANALYSE PATTERNS: 'Rebels Target Erickson Ole Miss has received permission from the San Francisco 49ers to speak with head coach Dennis Erickson.' 
============================================================ BERT: CLS attention max: 0.121 Self-attention mean: 0.118 Content/Function ratio: 2.24 Attention entropy: 46.629 DistilBERT: CLS attention max: 0.178 Self-attention mean: 0.091 Content/Function ratio: 2.49 Attention entropy: 50.740 RoBERTa: CLS attention max: 0.272 Self-attention mean: 0.120 Content/Function ratio: 37.04 Attention entropy: 60.282 ==================== SENTENCE 34 ==================== ANALYSE PATTERNS: 'US NBA players become the Nightmare Team after epic loss (AFP) AFP - Call them the "Nightmare Team".' ============================================================ BERT: CLS attention max: 0.086 Self-attention mean: 0.117 Content/Function ratio: 2.18 Attention entropy: 57.136 DistilBERT: CLS attention max: 0.183 Self-attention mean: 0.094 Content/Function ratio: 1.51 Attention entropy: 62.509 RoBERTa: CLS attention max: 0.291 Self-attention mean: 0.140 Content/Function ratio: 40.00 Attention entropy: 50.283 ==================== SENTENCE 35 ==================== ANALYSE PATTERNS: 'Sportsview: Eagles Have Attitude, Talent (AP) AP - The Philadelphia Eagles had talent. Now they have swagger and personality, too.' ============================================================ BERT: CLS attention max: 0.081 Self-attention mean: 0.097 Content/Function ratio: 3.19 Attention entropy: 78.178 DistilBERT: CLS attention max: 0.109 Self-attention mean: 0.085 Content/Function ratio: 2.05 Attention entropy: 78.285 RoBERTa: CLS attention max: 0.186 Self-attention mean: 0.111 Content/Function ratio: 31.25 Attention entropy: 73.647 ==================== SENTENCE 36 ==================== ANALYSE PATTERNS: 'Colts' Freeney Rushes to Get Most Sacks (AP) AP - Dwight Freeney always has gotten the attention of offenses.' 
============================================================ BERT: CLS attention max: 0.102 Self-attention mean: 0.115 Content/Function ratio: 3.15 Attention entropy: 58.072 DistilBERT: CLS attention max: 0.177 Self-attention mean: 0.086 Content/Function ratio: 2.12 Attention entropy: 60.464 RoBERTa: CLS attention max: 0.263 Self-attention mean: 0.127 Content/Function ratio: 31.25 Attention entropy: 66.365 ==================== SENTENCE 37 ==================== ANALYSE PATTERNS: 'Greek pair await IAAF fate Kostas Kenteris and Katerina Thanou's missed drugs tests will be investigated by the IAAF.' ============================================================ BERT: CLS attention max: 0.131 Self-attention mean: 0.102 Content/Function ratio: 3.29 Attention entropy: 56.117 DistilBERT: CLS attention max: 0.202 Self-attention mean: 0.074 Content/Function ratio: 3.05 Attention entropy: 54.536 RoBERTa: CLS attention max: 0.244 Self-attention mean: 0.117 Content/Function ratio: 3.44 Attention entropy: 63.883 ==================== SENTENCE 38 ==================== ANALYSE PATTERNS: 'Barnstable meets challenge The Barnstable girls' volleyball team was looking for challenging nonleague opponents before the state tournament.' ============================================================ BERT: CLS attention max: 0.112 Self-attention mean: 0.108 Content/Function ratio: 2.46 Attention entropy: 48.648 DistilBERT: CLS attention max: 0.205 Self-attention mean: 0.093 Content/Function ratio: 2.18 Attention entropy: 50.728 RoBERTa: CLS attention max: 0.264 Self-attention mean: 0.136 Content/Function ratio: 40.00 Attention entropy: 50.007 ==================== SENTENCE 39 ==================== ANALYSE PATTERNS: 'SI.com ST. LOUIS (Ticker) -- The Cincinnati Reds continue to find new ways to lose to the St. Louis Cardinals.' 
============================================================ BERT: CLS attention max: 0.084 Self-attention mean: 0.095 Content/Function ratio: 3.87 Attention entropy: 70.721 DistilBERT: CLS attention max: 0.103 Self-attention mean: 0.078 Content/Function ratio: 1.85 Attention entropy: 83.986 RoBERTa: CLS attention max: 0.263 Self-attention mean: 0.110 Content/Function ratio: 1.73 Attention entropy: 65.946 ==================== SENTENCE 40 ==================== ANALYSE PATTERNS: 'Tyson Completes Service Charges stemming from a 2003 altercation are dropped as Mike Tyson completes community service on Wednesday.' ============================================================ BERT: CLS attention max: 0.114 Self-attention mean: 0.107 Content/Function ratio: 2.47 Attention entropy: 41.588 DistilBERT: CLS attention max: 0.187 Self-attention mean: 0.096 Content/Function ratio: 1.83 Attention entropy: 48.059 RoBERTa: CLS attention max: 0.279 Self-attention mean: 0.149 Content/Function ratio: 40.00 Attention entropy: 50.392 ==================== SENTENCE 41 ==================== ANALYSE PATTERNS: 'Transactions BASEBALL Cleveland (AL): Sold INF Erick Almonte to Nippon (Japan). New York (AL): Signed P Tanyon Sturtze.' ============================================================ BERT: CLS attention max: 0.082 Self-attention mean: 0.096 Content/Function ratio: 2.84 Attention entropy: 85.574 DistilBERT: CLS attention max: 0.110 Self-attention mean: 0.077 Content/Function ratio: 3.54 Attention entropy: 95.095 RoBERTa: CLS attention max: 0.275 Self-attention mean: 0.114 Content/Function ratio: 3.96 Attention entropy: 75.298 ==================== SENTENCE 42 ==================== ANALYSE PATTERNS: 'Transactions BASEBALL Seattle (AL): Named Mike Hargrove manager and agreed to terms on a three-year contract.' 
============================================================ BERT: CLS attention max: 0.098 Self-attention mean: 0.100 Content/Function ratio: 3.94 Attention entropy: 54.168 DistilBERT: CLS attention max: 0.146 Self-attention mean: 0.083 Content/Function ratio: 2.30 Attention entropy: 58.925 RoBERTa: CLS attention max: 0.286 Self-attention mean: 0.135 Content/Function ratio: 35.71 Attention entropy: 56.673 ==================== SENTENCE 43 ==================== ANALYSE PATTERNS: 'British eventers slip back Great Britain slip down to third after the cross-country round of the three-day eventing.' ============================================================ BERT: CLS attention max: 0.107 Self-attention mean: 0.101 Content/Function ratio: 2.08 Attention entropy: 56.799 DistilBERT: CLS attention max: 0.237 Self-attention mean: 0.084 Content/Function ratio: 1.77 Attention entropy: 57.910 RoBERTa: CLS attention max: 0.268 Self-attention mean: 0.133 Content/Function ratio: 37.04 Attention entropy: 54.724 ==================== SENTENCE 44 ==================== ANALYSE PATTERNS: 'Final Preseason Game Important for McMahon (AP) AP - Don't tell Mike McMahon the NFL's final exhibitions are meaningless.' ============================================================ BERT: CLS attention max: 0.100 Self-attention mean: 0.099 Content/Function ratio: 2.34 Attention entropy: 66.570 DistilBERT: CLS attention max: 0.140 Self-attention mean: 0.079 Content/Function ratio: 1.57 Attention entropy: 66.973 RoBERTa: CLS attention max: 0.261 Self-attention mean: 0.132 Content/Function ratio: 37.04 Attention entropy: 56.080 ==================== SENTENCE 45 ==================== ANALYSE PATTERNS: 'Phelps' Trial Set 19-year-old Olympic swimming champion Michael Phelps' drunken driving trial is set for Dec. 29.' 
============================================================ BERT: CLS attention max: 0.102 Self-attention mean: 0.104 Content/Function ratio: 1.54 Attention entropy: 52.102 DistilBERT: CLS attention max: 0.172 Self-attention mean: 0.090 Content/Function ratio: 0.98 Attention entropy: 62.051 RoBERTa: CLS attention max: 0.275 Self-attention mean: 0.146 Content/Function ratio: 35.71 Attention entropy: 56.992 ==================== SENTENCE 46 ==================== ANALYSE PATTERNS: 'NL notables The Mets' Jeff Keppinger got his first major league steal in the seventh, swiping second.' ============================================================ BERT: CLS attention max: 0.095 Self-attention mean: 0.091 Content/Function ratio: 3.65 Attention entropy: 54.018 DistilBERT: CLS attention max: 0.200 Self-attention mean: 0.083 Content/Function ratio: 1.66 Attention entropy: 58.853 RoBERTa: CLS attention max: 0.291 Self-attention mean: 0.112 Content/Function ratio: 38.46 Attention entropy: 48.166 ==================== SENTENCE 47 ==================== ANALYSE PATTERNS: 'Sooners stop at nothing DALLAS -- The recruiting battle was as intense as everyone expected. Oklahoma against Texas -- again.' ============================================================ BERT: CLS attention max: 0.123 Self-attention mean: 0.104 Content/Function ratio: 1.78 Attention entropy: 51.506 DistilBERT: CLS attention max: 0.232 Self-attention mean: 0.090 Content/Function ratio: 0.92 Attention entropy: 54.546 RoBERTa: CLS attention max: 0.261 Self-attention mean: 0.125 Content/Function ratio: 37.04 Attention entropy: 54.345 ==================== SENTENCE 48 ==================== ANALYSE PATTERNS: 'Today's schedule Amateur baseball: Yawkey League playoffs -- South Boston vs. Somerville at Ronan Park, Dorchester, 7:30 p.m.' 
============================================================ BERT: CLS attention max: 0.108 Self-attention mean: 0.091 Content/Function ratio: 3.15 Attention entropy: 78.316 DistilBERT: CLS attention max: 0.173 Self-attention mean: 0.071 Content/Function ratio: 1.90 Attention entropy: 77.828 RoBERTa: CLS attention max: 0.272 Self-attention mean: 0.106 Content/Function ratio: 2.80 Attention entropy: 73.250 ==================== SENTENCE 49 ==================== ANALYSE PATTERNS: 'Today's schedule College hockey: MEN -- Worcester St. at Wentworth, 8 p.m.; WOMEN -- Rensselaer at MIT, 7 p.m.' ============================================================ BERT: CLS attention max: 0.071 Self-attention mean: 0.088 Content/Function ratio: 1.50 Attention entropy: 91.062 DistilBERT: CLS attention max: 0.160 Self-attention mean: 0.072 Content/Function ratio: 1.51 Attention entropy: 89.079 RoBERTa: CLS attention max: 0.282 Self-attention mean: 0.100 Content/Function ratio: 27.03 Attention entropy: 75.337 ==================== SENTENCE 50 ==================== ANALYSE PATTERNS: 'Martinez Deal Finalized Martinez passes his physical, and the Mets finalize their \$53 million, four-year contract with the pitcher.' ============================================================ BERT: CLS attention max: 0.114 Self-attention mean: 0.092 Content/Function ratio: 4.21 Attention entropy: 59.085 DistilBERT: CLS attention max: 0.180 Self-attention mean: 0.078 Content/Function ratio: 2.12 Attention entropy: 68.812 RoBERTa: CLS attention max: 0.259 Self-attention mean: 0.150 Content/Function ratio: 32.26 Attention entropy: 61.319 ==================== SENTENCE 51 ==================== ANALYSE PATTERNS: 'Red Bull snaps up Jaguar F1 team Energy drink company Red Bull has bought the Jaguar Formula One team.' 
============================================================ BERT: CLS attention max: 0.133 Self-attention mean: 0.122 Content/Function ratio: 1.44 Attention entropy: 45.058 DistilBERT: CLS attention max: 0.237 Self-attention mean: 0.091 Content/Function ratio: 1.54 Attention entropy: 42.626 RoBERTa: CLS attention max: 0.307 Self-attention mean: 0.143 Content/Function ratio: 43.48 Attention entropy: 45.069 ==================== SENTENCE 52 ==================== ANALYSE PATTERNS: 'Dell's Secret Earnings Engine The company gets its highest profit margins from a conspicuously old economy business.' ============================================================ BERT: CLS attention max: 0.130 Self-attention mean: 0.107 Content/Function ratio: 1.74 Attention entropy: 46.423 DistilBERT: CLS attention max: 0.216 Self-attention mean: 0.081 Content/Function ratio: 1.62 Attention entropy: 46.890 RoBERTa: CLS attention max: 0.269 Self-attention mean: 0.132 Content/Function ratio: 41.67 Attention entropy: 48.691 ==================== SENTENCE 53 ==================== ANALYSE PATTERNS: 'Which Medications Are Your Best Bets? Consumer Reports' effort to rate drugs offers a lesson to pharmaceutical companies.' ============================================================ BERT: CLS attention max: 0.111 Self-attention mean: 0.101 Content/Function ratio: 2.96 Attention entropy: 43.126 DistilBERT: CLS attention max: 0.188 Self-attention mean: 0.091 Content/Function ratio: 2.39 Attention entropy: 46.230 RoBERTa: CLS attention max: 0.275 Self-attention mean: 0.139 Content/Function ratio: 40.00 Attention entropy: 50.162 ==================== SENTENCE 54 ==================== ANALYSE PATTERNS: 'M'm! M'm! Could Be Better! Campbell Soup turns in a good quarter, but there are better alternatives.' 
============================================================ BERT: CLS attention max: 0.121 Self-attention mean: 0.103 Content/Function ratio: 3.56 Attention entropy: 57.247 DistilBERT: CLS attention max: 0.167 Self-attention mean: 0.076 Content/Function ratio: 2.43 Attention entropy: 64.630 RoBERTa: CLS attention max: 0.270 Self-attention mean: 0.139 Content/Function ratio: 38.46 Attention entropy: 50.044 ==================== SENTENCE 55 ==================== ANALYSE PATTERNS: 'Is Santa Skipping Wal-Mart? Plus, few defectors in the wireless war, and Overstock's locked and loaded.' ============================================================ BERT: CLS attention max: 0.100 Self-attention mean: 0.098 Content/Function ratio: 1.74 Attention entropy: 59.411 DistilBERT: CLS attention max: 0.176 Self-attention mean: 0.089 Content/Function ratio: 1.54 Attention entropy: 65.050 RoBERTa: CLS attention max: 0.243 Self-attention mean: 0.124 Content/Function ratio: 1.69 Attention entropy: 59.441 ==================== SENTENCE 56 ==================== ANALYSE PATTERNS: 'Whole Foods' Healthy Outlook The natural foods chain is predicting double-digit sales growth until 2010.' ============================================================ BERT: CLS attention max: 0.105 Self-attention mean: 0.118 Content/Function ratio: 1.86 Attention entropy: 40.640 DistilBERT: CLS attention max: 0.222 Self-attention mean: 0.093 Content/Function ratio: 1.57 Attention entropy: 39.633 RoBERTa: CLS attention max: 0.285 Self-attention mean: 0.152 Content/Function ratio: 45.45 Attention entropy: 41.706 ==================== SENTENCE 57 ==================== ANALYSE PATTERNS: 'A License to Print Money Will Coinstar's diversification efforts damage its uniquely profitable business model?' 
============================================================ BERT: CLS attention max: 0.151 Self-attention mean: 0.118 Content/Function ratio: 0.90 Attention entropy: 36.281 DistilBERT: CLS attention max: 0.239 Self-attention mean: 0.091 Content/Function ratio: 0.57 Attention entropy: 35.517 RoBERTa: CLS attention max: 0.336 Self-attention mean: 0.155 Content/Function ratio: 1.51 Attention entropy: 40.065 ==================== SENTENCE 58 ==================== ANALYSE PATTERNS: 'Palestinian economy in decline The Palestinian economy is in crisis, performing well below its potential, the World Bank says.' ============================================================ BERT: CLS attention max: 0.136 Self-attention mean: 0.099 Content/Function ratio: 2.20 Attention entropy: 47.137 DistilBERT: CLS attention max: 0.200 Self-attention mean: 0.078 Content/Function ratio: 2.62 Attention entropy: 50.624 RoBERTa: CLS attention max: 0.276 Self-attention mean: 0.120 Content/Function ratio: 41.67 Attention entropy: 46.551 ==================== SENTENCE 59 ==================== ANALYSE PATTERNS: 'Synnex's World Isn't Flat The company is moving along a dual track -- growth through acquisitions and organic efforts.' ============================================================ BERT: CLS attention max: 0.102 Self-attention mean: 0.103 Content/Function ratio: 2.46 Attention entropy: 64.966 DistilBERT: CLS attention max: 0.163 Self-attention mean: 0.079 Content/Function ratio: 1.82 Attention entropy: 67.154 RoBERTa: CLS attention max: 0.247 Self-attention mean: 0.122 Content/Function ratio: 40.00 Attention entropy: 51.148 ==================== SENTENCE 60 ==================== ANALYSE PATTERNS: 'Rising material costs hit Heinz Second quarter profits at ketchup maker Heinz are hit by higher material and transport costs.' 
============================================================ BERT: CLS attention max: 0.132 Self-attention mean: 0.120 Content/Function ratio: 1.54 Attention entropy: 49.509 DistilBERT: CLS attention max: 0.252 Self-attention mean: 0.087 Content/Function ratio: 1.71 Attention entropy: 48.379 RoBERTa: CLS attention max: 0.282 Self-attention mean: 0.126 Content/Function ratio: 37.04 Attention entropy: 51.436 ==================== SENTENCE 61 ==================== ANALYSE PATTERNS: 'Profit From Management Integrity Laser manufacturer Candela's management avoids taking the easy way out to explain shortfalls.' ============================================================ BERT: CLS attention max: 0.107 Self-attention mean: 0.111 Content/Function ratio: 2.31 Attention entropy: 50.228 DistilBERT: CLS attention max: 0.184 Self-attention mean: 0.086 Content/Function ratio: 2.02 Attention entropy: 54.418 RoBERTa: CLS attention max: 0.260 Self-attention mean: 0.157 Content/Function ratio: 41.67 Attention entropy: 48.239 ==================== SENTENCE 62 ==================== ANALYSE PATTERNS: 'United Airlines imposes wage cuts America's second largest airline announces widespread pay cuts as it strives to emerge from bankruptcy.' ============================================================ BERT: CLS attention max: 0.145 Self-attention mean: 0.095 Content/Function ratio: 24.88 Attention entropy: 50.851 DistilBERT: CLS attention max: 0.220 Self-attention mean: 0.078 Content/Function ratio: 30.40 Attention entropy: 53.361 RoBERTa: CLS attention max: 0.288 Self-attention mean: 0.135 Content/Function ratio: 41.67 Attention entropy: 46.866 ==================== SENTENCE 63 ==================== ANALYSE PATTERNS: 'Is Disney a Growth Stock? Plus, Mel's Sirius decision, Phil hangs up his Nikes, and Mattel's "free plus" dividend.' 
============================================================ BERT: CLS attention max: 0.072 Self-attention mean: 0.085 Content/Function ratio: 1.81 Attention entropy: 84.554 DistilBERT: CLS attention max: 0.155 Self-attention mean: 0.076 Content/Function ratio: 2.46 Attention entropy: 86.452 RoBERTa: CLS attention max: 0.231 Self-attention mean: 0.113 Content/Function ratio: 2.32 Attention entropy: 68.335 ==================== SENTENCE 64 ==================== ANALYSE PATTERNS: 'PDL Rakes It In A robust revenue stream combined with an exciting drug pipeline is the recipe for success.' ============================================================ BERT: CLS attention max: 0.104 Self-attention mean: 0.107 Content/Function ratio: 2.53 Attention entropy: 50.046 DistilBERT: CLS attention max: 0.238 Self-attention mean: 0.086 Content/Function ratio: 2.42 Attention entropy: 50.803 RoBERTa: CLS attention max: 0.287 Self-attention mean: 0.152 Content/Function ratio: 41.67 Attention entropy: 48.186 ==================== SENTENCE 65 ==================== ANALYSE PATTERNS: 'Emisphere Wins Novartis Over Emisphere warrants close attention, although massive profits are not in the near-term cards.' ============================================================ BERT: CLS attention max: 0.104 Self-attention mean: 0.102 Content/Function ratio: 3.47 Attention entropy: 61.267 DistilBERT: CLS attention max: 0.237 Self-attention mean: 0.079 Content/Function ratio: 2.63 Attention entropy: 61.966 RoBERTa: CLS attention max: 0.264 Self-attention mean: 0.130 Content/Function ratio: 2.60 Attention entropy: 53.318 ==================== SENTENCE 66 ==================== ANALYSE PATTERNS: 'Pricey Gas Stalls AutoZone The retailer posts a flat first quarter, claiming high gas prices affect consumers' car budgets.' 
============================================================ BERT: CLS attention max: 0.086 Self-attention mean: 0.119 Content/Function ratio: 2.04 Attention entropy: 55.576 DistilBERT: CLS attention max: 0.172 Self-attention mean: 0.086 Content/Function ratio: 1.70 Attention entropy: 56.851 RoBERTa: CLS attention max: 0.265 Self-attention mean: 0.135 Content/Function ratio: 37.04 Attention entropy: 55.092 ==================== SENTENCE 67 ==================== ANALYSE PATTERNS: 'One Really Ugly Mark on Star Gas Inscribe this stock with "losing customers, bad debt terms, and, maybe, bankruptcy."' ============================================================ BERT: CLS attention max: 0.108 Self-attention mean: 0.105 Content/Function ratio: 2.69 Attention entropy: 70.515 DistilBERT: CLS attention max: 0.179 Self-attention mean: 0.082 Content/Function ratio: 1.38 Attention entropy: 74.483 RoBERTa: CLS attention max: 0.263 Self-attention mean: 0.122 Content/Function ratio: 33.33 Attention entropy: 63.683 ==================== SENTENCE 68 ==================== ANALYSE PATTERNS: 'High oil prices hit China growth Rising oil prices are expected to hit China's growth rate this year.' ============================================================ BERT: CLS attention max: 0.092 Self-attention mean: 0.111 Content/Function ratio: 1.80 Attention entropy: 46.162 DistilBERT: CLS attention max: 0.245 Self-attention mean: 0.084 Content/Function ratio: 1.26 Attention entropy: 45.522 RoBERTa: CLS attention max: 0.271 Self-attention mean: 0.123 Content/Function ratio: 45.45 Attention entropy: 43.725 ==================== SENTENCE 69 ==================== ANALYSE PATTERNS: 'Bowes Takes a Bow Pitney Bowes always seems to mail it in -- and that's not necessarily a bad thing.' 
============================================================ BERT: CLS attention max: 0.130 Self-attention mean: 0.090 Content/Function ratio: 3.08 Attention entropy: 51.124 DistilBERT: CLS attention max: 0.206 Self-attention mean: 0.080 Content/Function ratio: 2.54 Attention entropy: 63.539 RoBERTa: CLS attention max: 0.266 Self-attention mean: 0.129 Content/Function ratio: 37.04 Attention entropy: 52.537 ==================== SENTENCE 70 ==================== ANALYSE PATTERNS: 'ADV: \$150,000 Mortgage for Under \$690/Month Mortgage rates are at record lows. Save \$1000s on your mortgage payment. Free quotes.' ============================================================ BERT: CLS attention max: 0.090 Self-attention mean: 0.103 Content/Function ratio: 1.56 Attention entropy: 95.987 DistilBERT: CLS attention max: 0.144 Self-attention mean: 0.072 Content/Function ratio: 1.42 Attention entropy: 88.378 RoBERTa: CLS attention max: 0.213 Self-attention mean: 0.127 Content/Function ratio: 26.32 Attention entropy: 84.686 ==================== SENTENCE 71 ==================== ANALYSE PATTERNS: 'Wal-Mart Sees Lackluster November Sales (Reuters) Reuters - Wal-Mart Stores Inc. , the\world's largest retailer, slashed its own expectations of' ============================================================ BERT: CLS attention max: 0.083 Self-attention mean: 0.091 Content/Function ratio: 3.09 Attention entropy: 81.875 DistilBERT: CLS attention max: 0.142 Self-attention mean: 0.070 Content/Function ratio: 1.56 Attention entropy: 90.060 RoBERTa: CLS attention max: 0.275 Self-attention mean: 0.126 Content/Function ratio: 28.57 Attention entropy: 76.071 ==================== SENTENCE 72 ==================== ANALYSE PATTERNS: 'Don't Listen to Buffett Not all the time, anyway. Moneyball author Michael Lewis says conventional wisdom creates inefficiencies.' 
============================================================ BERT: CLS attention max: 0.110 Self-attention mean: 0.102 Content/Function ratio: 5.55 Attention entropy: 60.896 DistilBERT: CLS attention max: 0.141 Self-attention mean: 0.082 Content/Function ratio: 3.11 Attention entropy: 78.527 RoBERTa: CLS attention max: 0.194 Self-attention mean: 0.121 Content/Function ratio: 37.04 Attention entropy: 60.009 ==================== SENTENCE 73 ==================== ANALYSE PATTERNS: 'In Pursuit of Happiness One Fool experiences more than two hours of lost pre-party productivity in a Barnes Noble quest.' ============================================================ BERT: CLS attention max: 0.118 Self-attention mean: 0.098 Content/Function ratio: 2.11 Attention entropy: 48.448 DistilBERT: CLS attention max: 0.199 Self-attention mean: 0.080 Content/Function ratio: 2.01 Attention entropy: 53.439 RoBERTa: CLS attention max: 0.255 Self-attention mean: 0.135 Content/Function ratio: 1.88 Attention entropy: 54.234 ==================== SENTENCE 74 ==================== ANALYSE PATTERNS: 'Middle Class America The Post's Jonathan Weisman discusses the increasing importance of temporary employment to the American economy.' ============================================================ BERT: CLS attention max: 0.099 Self-attention mean: 0.111 Content/Function ratio: 2.97 Attention entropy: 42.892 DistilBERT: CLS attention max: 0.213 Self-attention mean: 0.088 Content/Function ratio: 1.98 Attention entropy: 46.119 RoBERTa: CLS attention max: 0.285 Self-attention mean: 0.134 Content/Function ratio: 43.48 Attention entropy: 45.155 ==================== SENTENCE 75 ==================== ANALYSE PATTERNS: 'Stern and Letterman Get Sirius Stern has a date with Letterman tonight, and you can expect fireworks.' 
============================================================ BERT: CLS attention max: 0.109 Self-attention mean: 0.116 Content/Function ratio: 3.30 Attention entropy: 43.898 DistilBERT: CLS attention max: 0.183 Self-attention mean: 0.094 Content/Function ratio: 3.89 Attention entropy: 45.688 RoBERTa: CLS attention max: 0.273 Self-attention mean: 0.144 Content/Function ratio: 41.67 Attention entropy: 49.072 ==================== SENTENCE 76 ==================== ANALYSE PATTERNS: 'Nortel to lay off 3,500 The scandal-beset company will also lay off about 10 percent of its work force.' ============================================================ BERT: CLS attention max: 0.091 Self-attention mean: 0.111 Content/Function ratio: 2.71 Attention entropy: 52.716 DistilBERT: CLS attention max: 0.195 Self-attention mean: 0.085 Content/Function ratio: 1.89 Attention entropy: 59.245 RoBERTa: CLS attention max: 0.282 Self-attention mean: 0.129 Content/Function ratio: 34.48 Attention entropy: 56.232 ==================== SENTENCE 77 ==================== ANALYSE PATTERNS: 'Microsoft Eyes Lighter Versions of Longhorn Operating systems would be designed for specific server tasks, company says.' ============================================================ BERT: CLS attention max: 0.107 Self-attention mean: 0.114 Content/Function ratio: 2.03 Attention entropy: 46.024 DistilBERT: CLS attention max: 0.174 Self-attention mean: 0.092 Content/Function ratio: 2.36 Attention entropy: 45.003 RoBERTa: CLS attention max: 0.275 Self-attention mean: 0.117 Content/Function ratio: 41.67 Attention entropy: 50.254 ==================== SENTENCE 78 ==================== ANALYSE PATTERNS: 'Tokyo Edge: New Choices in Digital Entertainment PC and home theater make sleek package, while new portable music devices abound.' 
============================================================ BERT: CLS attention max: 0.099 Self-attention mean: 0.098 Content/Function ratio: 1.96 Attention entropy: 57.856 DistilBERT: CLS attention max: 0.121 Self-attention mean: 0.083 Content/Function ratio: 1.84 Attention entropy: 60.955 RoBERTa: CLS attention max: 0.255 Self-attention mean: 0.104 Content/Function ratio: 37.04 Attention entropy: 58.760 ==================== SENTENCE 79 ==================== ANALYSE PATTERNS: 'First Look: Skip Gateway's MP3 Photo Jukebox Color display and photo support can't save oddly designed player.' ============================================================ BERT: CLS attention max: 0.098 Self-attention mean: 0.098 Content/Function ratio: 28.83 Attention entropy: 54.808 DistilBERT: CLS attention max: 0.135 Self-attention mean: 0.087 Content/Function ratio: 35.57 Attention entropy: 64.080 RoBERTa: CLS attention max: 0.245 Self-attention mean: 0.112 Content/Function ratio: 38.46 Attention entropy: 56.762 ==================== SENTENCE 80 ==================== ANALYSE PATTERNS: 'Reg readers name BSA antipiracy weasel Poll result The people have spoken' ============================================================ BERT: CLS attention max: 0.114 Self-attention mean: 0.150 Content/Function ratio: 0.96 Attention entropy: 28.237 DistilBERT: CLS attention max: 0.281 Self-attention mean: 0.109 Content/Function ratio: 0.79 Attention entropy: 28.323 RoBERTa: CLS attention max: 0.359 Self-attention mean: 0.147 Content/Function ratio: 55.56 Attention entropy: 32.112 ==================== SENTENCE 81 ==================== ANALYSE PATTERNS: 'GeekTech: Here Comes BTX New industry standard should offer cooler, quieter systems--so why isn't anybody rushing to embrace it?' 
============================================================ BERT: CLS attention max: 0.117 Self-attention mean: 0.090 Content/Function ratio: 22.79 Attention entropy: 62.549 DistilBERT: CLS attention max: 0.146 Self-attention mean: 0.086 Content/Function ratio: 23.51 Attention entropy: 68.407 RoBERTa: CLS attention max: 0.308 Self-attention mean: 0.141 Content/Function ratio: 33.33 Attention entropy: 59.569 ==================== SENTENCE 82 ==================== ANALYSE PATTERNS: 'Microsoft Readies Windows Server 2003 Update R2, an interim release, will begin beta testing later this month.' ============================================================ BERT: CLS attention max: 0.093 Self-attention mean: 0.109 Content/Function ratio: 3.68 Attention entropy: 52.107 DistilBERT: CLS attention max: 0.191 Self-attention mean: 0.086 Content/Function ratio: 4.77 Attention entropy: 49.449 RoBERTa: CLS attention max: 0.277 Self-attention mean: 0.130 Content/Function ratio: 41.67 Attention entropy: 49.618 ==================== SENTENCE 83 ==================== ANALYSE PATTERNS: 'Microsoft sees bespoke Windows everywhere Analysis Premium hand-tuning service available' ============================================================ BERT: CLS attention max: 0.174 Self-attention mean: 0.139 Content/Function ratio: 30.11 Attention entropy: 28.086 DistilBERT: CLS attention max: 0.282 Self-attention mean: 0.115 Content/Function ratio: 28.85 Attention entropy: 27.557 RoBERTa: CLS attention max: 0.383 Self-attention mean: 0.143 Content/Function ratio: 62.50 Attention entropy: 30.654 ==================== SENTENCE 84 ==================== ANALYSE PATTERNS: 'Virus targets 64-bit Windows Digital pest prototype infects files only found in early Windows code for AMD 64-bit Opteron processors.' 
============================================================ BERT: CLS attention max: 0.123 Self-attention mean: 0.095 Content/Function ratio: 1.88 Attention entropy: 64.211 DistilBERT: CLS attention max: 0.186 Self-attention mean: 0.080 Content/Function ratio: 2.11 Attention entropy: 67.948 RoBERTa: CLS attention max: 0.234 Self-attention mean: 0.111 Content/Function ratio: 2.53 Attention entropy: 67.480 ==================== SENTENCE 85 ==================== ANALYSE PATTERNS: 'Sony camera blends photos, video The company's latest digital camera includes advanced video features.' ============================================================ BERT: CLS attention max: 0.130 Self-attention mean: 0.113 Content/Function ratio: 1.80 Attention entropy: 40.820 DistilBERT: CLS attention max: 0.229 Self-attention mean: 0.086 Content/Function ratio: 2.22 Attention entropy: 42.075 RoBERTa: CLS attention max: 0.306 Self-attention mean: 0.124 Content/Function ratio: 52.63 Attention entropy: 36.942 ==================== SENTENCE 86 ==================== ANALYSE PATTERNS: 'Triumphant return of the big Reg logo t-shirt Cash'n'Carrion Cue trumpets, etc' ============================================================ BERT: CLS attention max: 0.150 Self-attention mean: 0.118 Content/Function ratio: 1.01 Attention entropy: 39.601 DistilBERT: CLS attention max: 0.231 Self-attention mean: 0.100 Content/Function ratio: 1.13 Attention entropy: 43.212 RoBERTa: CLS attention max: 0.337 Self-attention mean: 0.130 Content/Function ratio: 38.46 Attention entropy: 51.250 ==================== SENTENCE 87 ==================== ANALYSE PATTERNS: 'Worms may slow Parkinson's A protein which helps increase lifespan in worms offers hope for new Parkinson's and Alzheimer's treatments.' 
============================================================ BERT: CLS attention max: 0.110 Self-attention mean: 0.101 Content/Function ratio: 2.27 Attention entropy: 62.052 DistilBERT: CLS attention max: 0.204 Self-attention mean: 0.078 Content/Function ratio: 3.67 Attention entropy: 67.003 RoBERTa: CLS attention max: 0.268 Self-attention mean: 0.142 Content/Function ratio: 35.71 Attention entropy: 57.050 ==================== SENTENCE 88 ==================== ANALYSE PATTERNS: 'FCC Moves Toward Voice, Data, Broadband on Planes Agency to auction licenses for communications; consider cell phone use during flights.' ============================================================ BERT: CLS attention max: 0.127 Self-attention mean: 0.113 Content/Function ratio: 1.03 Attention entropy: 51.383 DistilBERT: CLS attention max: 0.201 Self-attention mean: 0.091 Content/Function ratio: 2.38 Attention entropy: 54.816 RoBERTa: CLS attention max: 0.247 Self-attention mean: 0.102 Content/Function ratio: 34.48 Attention entropy: 65.139 ==================== SENTENCE 89 ==================== ANALYSE PATTERNS: 'Thumb twiddling on cybersecurity Congresswoman Zoe Lofgren says bureaucratic miscues continue to hamstring serious government action to combat cyberattacks.' 
============================================================ BERT: CLS attention max: 0.120 Self-attention mean: 0.092 Content/Function ratio: 1.82 Attention entropy: 76.549 DistilBERT: CLS attention max: 0.209 Self-attention mean: 0.071 Content/Function ratio: 1.34 Attention entropy: 80.111 RoBERTa: CLS attention max: 0.251 Self-attention mean: 0.118 Content/Function ratio: 3.31 Attention entropy: 59.397 ==================== SENTENCE 90 ==================== ANALYSE PATTERNS: 'Last Xmas order date for the Antipodes Cash'n'Carrion Get 'em in by Sunday' ============================================================ BERT: CLS attention max: 0.200 Self-attention mean: 0.127 Content/Function ratio: 0.91 Attention entropy: 43.208 DistilBERT: CLS attention max: 0.289 Self-attention mean: 0.104 Content/Function ratio: 0.85 Attention entropy: 44.560 RoBERTa: CLS attention max: 0.358 Self-attention mean: 0.136 Content/Function ratio: 40.00 Attention entropy: 47.427 ==================== SENTENCE 91 ==================== ANALYSE PATTERNS: 'Alleged Apple Flash iPod 'partner' signs with Rio SigmaTel's chips claimed to have won Apple's support' ============================================================ BERT: CLS attention max: 0.152 Self-attention mean: 0.111 Content/Function ratio: 17.41 Attention entropy: 48.011 DistilBERT: CLS attention max: 0.249 Self-attention mean: 0.085 Content/Function ratio: 20.24 Attention entropy: 49.751 RoBERTa: CLS attention max: 0.327 Self-attention mean: 0.147 Content/Function ratio: 38.46 Attention entropy: 53.579 ==================== SENTENCE 92 ==================== ANALYSE PATTERNS: 'Older Windows OSes need critical patch Microsoft releases critical Explorer patch VNUNet.com' ============================================================ BERT: CLS attention max: 0.162 Self-attention mean: 0.128 Content/Function ratio: 22.65 Attention entropy: 33.761 DistilBERT: CLS attention max: 0.266 Self-attention mean: 0.110 Content/Function ratio: 20.40 
Attention entropy: 29.918 RoBERTa: CLS attention max: 0.254 Self-attention mean: 0.134 Content/Function ratio: 47.62 Attention entropy: 45.713 ==================== SENTENCE 93 ==================== ANALYSE PATTERNS: 'Photo: XM's portable satellite radio XM Satellite Radio Holdings introduced a handheld portable version of its satellite radio.' ============================================================ BERT: CLS attention max: 0.117 Self-attention mean: 0.102 Content/Function ratio: 2.43 Attention entropy: 51.215 DistilBERT: CLS attention max: 0.172 Self-attention mean: 0.082 Content/Function ratio: 2.18 Attention entropy: 55.097 RoBERTa: CLS attention max: 0.288 Self-attention mean: 0.121 Content/Function ratio: 40.00 Attention entropy: 50.175 ==================== SENTENCE 94 ==================== ANALYSE PATTERNS: 'US cyber security chief resigns The man charged with making US computer networks safer has resigned suddenly.' ============================================================ BERT: CLS attention max: 0.172 Self-attention mean: 0.128 Content/Function ratio: 1.65 Attention entropy: 40.506 DistilBERT: CLS attention max: 0.209 Self-attention mean: 0.101 Content/Function ratio: 1.68 Attention entropy: 41.638 RoBERTa: CLS attention max: 0.296 Self-attention mean: 0.132 Content/Function ratio: 47.62 Attention entropy: 40.024 ==================== SENTENCE 95 ==================== ANALYSE PATTERNS: 'McAfee Enhances Spyware Protection (PC World) PC World - Antivirus company offers improved anti-spyware app to users for a fee.' 
============================================================ BERT: CLS attention max: 0.118 Self-attention mean: 0.122 Content/Function ratio: 2.30 Attention entropy: 76.505 DistilBERT: CLS attention max: 0.213 Self-attention mean: 0.078 Content/Function ratio: 2.50 Attention entropy: 73.708 RoBERTa: CLS attention max: 0.251 Self-attention mean: 0.144 Content/Function ratio: 29.41 Attention entropy: 75.294 ==================== SENTENCE 96 ==================== ANALYSE PATTERNS: 'Arm Holdings buys US tech firm Microprocessor designer Arm Holdings buys US tech firm Artisan for about \$913m.' ============================================================ BERT: CLS attention max: 0.103 Self-attention mean: 0.096 Content/Function ratio: 28.26 Attention entropy: 57.627 DistilBERT: CLS attention max: 0.190 Self-attention mean: 0.087 Content/Function ratio: 28.14 Attention entropy: 56.872 RoBERTa: CLS attention max: 0.278 Self-attention mean: 0.124 Content/Function ratio: 37.04 Attention entropy: 51.233 ==================== SENTENCE 97 ==================== ANALYSE PATTERNS: 'Outsourcing Finds Vietnam Vietnam is making a big push to turn itself into an outsourcing powerhouse.' ============================================================ BERT: CLS attention max: 0.115 Self-attention mean: 0.106 Content/Function ratio: 2.41 Attention entropy: 46.226 DistilBERT: CLS attention max: 0.245 Self-attention mean: 0.086 Content/Function ratio: 2.36 Attention entropy: 44.096 RoBERTa: CLS attention max: 0.290 Self-attention mean: 0.138 Content/Function ratio: 45.45 Attention entropy: 44.414 ==================== SENTENCE 98 ==================== ANALYSE PATTERNS: 'Watchdog attacks ID card scheme Proposals for identity cards and a population register are opposed by Britain's information watchdog.' 
============================================================ BERT: CLS attention max: 0.096 Self-attention mean: 0.109 Content/Function ratio: 1.74 Attention entropy: 53.928 DistilBERT: CLS attention max: 0.239 Self-attention mean: 0.078 Content/Function ratio: 1.73 Attention entropy: 50.667 RoBERTa: CLS attention max: 0.279 Self-attention mean: 0.135 Content/Function ratio: 38.46 Attention entropy: 53.603 ==================== SENTENCE 99 ==================== ANALYSE PATTERNS: 'Salesforce.com reports subscriber surge The subscription software company adds 85,000 individual subscribers to its online customer information system.' ============================================================ BERT: CLS attention max: 0.098 Self-attention mean: 0.106 Content/Function ratio: 2.21 Attention entropy: 61.773 DistilBERT: CLS attention max: 0.219 Self-attention mean: 0.072 Content/Function ratio: 1.80 Attention entropy: 58.843 RoBERTa: CLS attention max: 0.287 Self-attention mean: 0.140 Content/Function ratio: 38.46 Attention entropy: 54.408 ==================== SENTENCE 100 ==================== ANALYSE PATTERNS: 'Will historic flight launch space tourism? Regardless, space competitions are poised to become big business.' ============================================================ BERT: CLS attention max: 0.091 Self-attention mean: 0.118 Content/Function ratio: 2.11 Attention entropy: 43.097 DistilBERT: CLS attention max: 0.189 Self-attention mean: 0.093 Content/Function ratio: 1.71 Attention entropy: 42.230 RoBERTa: CLS attention max: 0.285 Self-attention mean: 0.132 Content/Function ratio: 50.00 Attention entropy: 39.187
def visualize_attention_patterns(all_patterns, sentence_domains):
    """Visualize attention patterns across models and sentences.

    Renders four artefacts:
      1. A 2x2 bar-chart grid of overall per-model averages for each metric.
      2. A pivot table of metric means grouped by (domain, model).
      3. A flat per-sentence table with every metric value.
      4. A 2x2 grouped bar-chart grid comparing models within each domain.

    Parameters
    ----------
    all_patterns : dict
        Mapping sentence_key -> {model_name -> {metric_name -> value}}.
        Every sentence entry is expected to contain the same model names.
    sentence_domains : list
        Domain label for each sentence, aligned with the iteration order
        of ``all_patterns``.

    Returns
    -------
    tuple
        (overall-averages figure, per-sentence DataFrame, domain pivot table).

    Notes
    -----
    Relies on the module-level ``MODELS`` dict for per-model plot colors and
    on IPython's ``display`` being available (notebook environment).
    """
    metrics = ["cls_attention_max", "self_attention_mean", "content_vs_function", "attention_entropy"]
    metric_names = ["CLS Attention Max", "Self-Attention Mean", "Content/Function Ratio", "Entropy"]
    sentence_keys = list(all_patterns.keys())
    # Derive the model list from the data itself instead of the global
    # models_data dict, so the function works on any compatible patterns dict
    # and does not silently depend on notebook state.
    models = list(all_patterns[sentence_keys[0]].keys()) if sentence_keys else []

    # 1. GRAPHICS BY METRIC (overall averages per model)
    print("\n OVERALL AVERAGE PATTERNS:")
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=metric_names,
        specs=[[{"type": "bar"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "bar"}]]
    )
    positions = [(1, 1), (1, 2), (2, 1), (2, 2)]
    for idx, (metric, metric_name) in enumerate(zip(metrics, metric_names)):
        row, col = positions[idx]
        # Average each metric over all sentences, per model.
        model_means = {}
        for model in models:
            values = [all_patterns[sentence_key][model][metric] for sentence_key in sentence_keys]
            model_means[model] = np.mean(values)
        colors = [MODELS[model]["color"] for model in models]
        fig.add_trace(
            go.Bar(
                x=list(model_means.keys()),
                y=list(model_means.values()),
                marker_color=colors,
                name=metric_name,
                showlegend=False
            ),
            row=row, col=col
        )
    fig.update_layout(
        title_text="Attention Patterns per Model (Overall Averages)",
        title_x=0.5,
        height=600
    )
    fig.show()

    # 2. TABLE BY DOMAIN — long-format records, then pivot to (domain, model).
    print("\n TABLE BY DOMAIN:")
    domain_data = []
    for sentence_key, domain in zip(sentence_keys, sentence_domains):
        for model in models:
            for metric in metrics:
                domain_data.append({
                    "Domain": domain,
                    "Model": model,
                    "Metric": metric,
                    "Value": all_patterns[sentence_key][model][metric]
                })
    domain_df = pd.DataFrame(domain_data)
    pivot_table = domain_df.pivot_table(
        values='Value',
        index=['Domain', 'Model'],
        columns='Metric',
        aggfunc='mean'
    ).round(3)
    print("\nAverage metrics by domain and model:")
    display(pivot_table)

    # 3. TABLE BY SENTENCES — one row per (sentence, model), metrics as
    # pre-formatted strings for compact display.
    print("\n TABLE BY SENTENCES:")
    pattern_data = []
    for i, (sentence_key, domain) in enumerate(zip(sentence_keys, sentence_domains)):
        for model in models:
            row = {
                "Sentence": f"S{i+1}",
                "Domain": domain,
                "Model": model
            }
            for metric in metrics:
                row[metric] = f"{all_patterns[sentence_key][model][metric]:.3f}"
            pattern_data.append(row)
    pattern_df = pd.DataFrame(pattern_data)
    display(pattern_df)

    # 4. COMPARATIVE CHART BY DOMAIN — per-domain means, grouped by model.
    print("\n DOMAIN COMPARISON:")
    fig_domain = make_subplots(
        rows=2, cols=2,
        subplot_titles=metric_names,
        specs=[[{"type": "bar"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "bar"}]]
    )
    unique_domains = sorted(set(sentence_domains))
    for idx, (metric, metric_name) in enumerate(zip(metrics, metric_names)):
        row, col = positions[idx]
        for model in models:
            domain_means = []
            for domain in unique_domains:
                # Filter data for this domain
                domain_values = [
                    all_patterns[sentence_key][model][metric]
                    for sentence_key, d in zip(sentence_keys, sentence_domains)
                    if d == domain
                ]
                domain_means.append(np.mean(domain_values))
            fig_domain.add_trace(
                go.Bar(
                    x=unique_domains,
                    y=domain_means,
                    name=model,
                    marker_color=MODELS[model]["color"],
                    showlegend=(idx == 0)  # Show legend only in first subplot
                ),
                row=row, col=col
            )
    fig_domain.update_layout(
        title_text="Attention Patterns by Domain",
        title_x=0.5,
        height=700,
        barmode='group'
    )
    fig_domain.show()
    return fig, pattern_df, pivot_table
patterns_fig, patterns_df, domain_pivot = visualize_attention_patterns(all_patterns, SENTENCE_DOMAINS)
OVERALL AVERAGE PATTERNS:
TABLE BY DOMAIN: Average metrics by domain and model:
| Metric | attention_entropy | cls_attention_max | content_vs_function | self_attention_mean | |
|---|---|---|---|---|---|
| Domain | Model | ||||
| Business | BERT | 55.365 | 0.111 | 3.344 | 0.105 |
| DistilBERT | 58.253 | 0.197 | 3.143 | 0.083 | |
| RoBERTa | 53.341 | 0.267 | 31.708 | 0.133 | |
| Sci/Tech | BERT | 50.914 | 0.123 | 7.479 | 0.112 |
| DistilBERT | 52.223 | 0.211 | 7.841 | 0.089 | |
| RoBERTa | 51.451 | 0.289 | 38.574 | 0.129 | |
| Sports | BERT | 61.612 | 0.106 | 2.803 | 0.101 |
| DistilBERT | 65.759 | 0.173 | 2.013 | 0.083 | |
| RoBERTa | 59.828 | 0.269 | 27.725 | 0.130 | |
| World | BERT | 55.811 | 0.127 | 5.519 | 0.104 |
| DistilBERT | 57.042 | 0.182 | 5.542 | 0.082 | |
| RoBERTa | 54.705 | 0.274 | 33.588 | 0.128 |
TABLE BY SENTENCES:
| Sentence | Domain | Model | cls_attention_max | self_attention_mean | content_vs_function | attention_entropy | |
|---|---|---|---|---|---|---|---|
| 0 | S1 | World | BERT | 0.108 | 0.088 | 1.349 | 61.730 |
| 1 | S1 | World | DistilBERT | 0.171 | 0.070 | 0.888 | 62.514 |
| 2 | S1 | World | RoBERTa | 0.281 | 0.128 | 3.808 | 60.199 |
| 3 | S2 | World | BERT | 0.102 | 0.112 | 1.587 | 53.609 |
| 4 | S2 | World | DistilBERT | 0.196 | 0.083 | 2.178 | 50.663 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 295 | S99 | Sci/Tech | DistilBERT | 0.219 | 0.072 | 1.799 | 58.843 |
| 296 | S99 | Sci/Tech | RoBERTa | 0.287 | 0.140 | 38.462 | 54.408 |
| 297 | S100 | Sci/Tech | BERT | 0.091 | 0.118 | 2.107 | 43.097 |
| 298 | S100 | Sci/Tech | DistilBERT | 0.189 | 0.093 | 1.711 | 42.230 |
| 299 | S100 | Sci/Tech | RoBERTa | 0.285 | 0.132 | 50.000 | 39.187 |
300 rows × 7 columns
DOMAIN COMPARISON:
import time
import psutil
import gc
def benchmark_model_performance(models_data, test_sentences):
    """Benchmark speed and memory usage of models.

    For every model, runs one forward pass per test sentence and records
    wall-clock inference time and resident-memory delta, then reports
    per-model averages, parameter counts, and speed relative to the first
    model in ``models_data`` (the baseline — BERT in this notebook).

    Parameters
    ----------
    models_data : dict
        Mapping model_name -> {"tokenizer": ..., "model": ...}.
    test_sentences : iterable of str
        Sentences to run through each model.

    Returns
    -------
    pd.DataFrame
        One row per model; timing/memory/parameter columns are
        pre-formatted strings for display.
    """
    print("BENCHMARK PERFORMANCE MODELS")
    print("=" * 50)
    results = []
    raw_avg_times = []  # numeric average times (s), parallel to `results`
    for model_name, data in models_data.items():
        print(f"\nTest {model_name}...")
        tokenizer = data["tokenizer"]
        model = data["model"]
        times = []
        memory_usage = []
        for sentence in test_sentences:
            gc.collect()  # reduce allocator noise before each measurement
            memory_before = psutil.Process().memory_info().rss / 1024 / 1024  # MB
            # perf_counter is monotonic and high-resolution; time.time can
            # jump backwards (NTP) and is unsuitable for benchmarking.
            start_time = time.perf_counter()
            with torch.no_grad():
                inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
                outputs = model(**inputs)
            times.append(time.perf_counter() - start_time)
            memory_after = psutil.Process().memory_info().rss / 1024 / 1024  # MB
            memory_usage.append(memory_after - memory_before)
        avg_time = np.mean(times)
        std_time = np.std(times)
        avg_memory = np.mean(memory_usage)
        total_params = sum(p.numel() for p in model.parameters())
        raw_avg_times.append(avg_time)
        results.append({
            "Model": model_name,
            "Avg Time (ms)": f"{avg_time*1000:.2f}",
            # Renamed from the French "Écart-type (ms)" for schema consistency.
            "Std Dev (ms)": f"{std_time*1000:.2f}",
            "Memory (MB)": f"{avg_memory:.1f}",
            "Parameters": f"{total_params:,}",
            "Speed Relative": "1.00x"  # placeholder, filled in below
        })
        print(f" Avg Time: {avg_time*1000:.2f} ms")
        print(f" Memory: {avg_memory:.1f} MB")
        print(f" Parameters: {total_params:,}")
    # Relative speed vs the first model (baseline), computed from the raw
    # floats instead of re-parsing the formatted display strings.
    if raw_avg_times:
        baseline = raw_avg_times[0]
        for result, t in zip(results, raw_avg_times):
            result["Speed Relative"] = f"{baseline / t:.2f}x" if t > 0 else "n/a"
    return pd.DataFrame(results)
# Benchmark all loaded models on the test sentences and show the results table.
performance_df = benchmark_model_performance(models_data, TEST_SENTENCES)
print("\n BENCHMARK RESULTS:")
display(performance_df)
BENCHMARK PERFORMANCE MODELS ================================================== Test BERT... Avg Time: 44.25 ms Memory: 0.5 MB Parameters: 109,482,240 Test DistilBERT... Avg Time: 22.20 ms Memory: 0.0 MB Parameters: 66,362,880 Test RoBERTa... Avg Time: 44.05 ms Memory: 0.4 MB Parameters: 124,645,632 BENCHMARK RESULTS:
| Model | Avg Time (ms) | Écart-type (ms) | Memory (MB) | Parameters | Speed Relative | |
|---|---|---|---|---|---|---|
| 0 | BERT | 44.25 | 3.65 | 0.5 | 109,482,240 | 1.00x |
| 1 | DistilBERT | 22.20 | 0.98 | 0.0 | 66,362,880 | 1.99x |
| 2 | RoBERTa | 44.05 | 2.67 | 0.4 | 124,645,632 | 1.00x |
def create_performance_charts(performance_df):
    """Create a 2x2 grid of performance comparison charts.

    Panels: inference time, memory usage, parameter count, and a
    time-vs-parameters trade-off scatter.

    Parameters
    ----------
    performance_df : pd.DataFrame
        Output of ``benchmark_model_performance``; numeric columns are
        stored as pre-formatted strings (e.g. "44.25", "109,482,240").

    Returns
    -------
    plotly.graph_objects.Figure
        The assembled (and already shown) figure.

    Notes
    -----
    Relies on the module-level ``MODELS`` dict for per-model colors.
    """
    models = performance_df["Model"].values
    # Parse the display strings back to numbers. The former
    # .replace(' ms', '') / .replace(' MB', '') calls were no-ops: the
    # benchmark never stores those suffixes, only bare numbers.
    times = [float(x) for x in performance_df["Avg Time (ms)"].values]
    memory = [float(x) for x in performance_df["Memory (MB)"].values]
    params = [int(x.replace(',', '')) for x in performance_df["Parameters"].values]
    colors = [MODELS[model]["color"] for model in models]
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=("Inference Time", "Memory Usage",
                        "Parameter Count", "Speed vs Parameters"),
        specs=[[{"type": "bar"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "scatter"}]]
    )
    # Chart 1: Time
    fig.add_trace(
        go.Bar(x=models, y=times, marker_color=colors, name="Time", showlegend=False),
        row=1, col=1
    )
    # Chart 2: Memory
    fig.add_trace(
        go.Bar(x=models, y=memory, marker_color=colors, name="Memory", showlegend=False),
        row=1, col=2
    )
    # Chart 3: Parameters
    fig.add_trace(
        go.Bar(x=models, y=params, marker_color=colors, name="Parameters", showlegend=False),
        row=2, col=1
    )
    # Chart 4: Trade-off (one labeled marker per model)
    fig.add_trace(
        go.Scatter(
            x=times, y=params,
            mode='markers+text',
            marker=dict(size=15, color=colors),
            text=models,
            textposition="top center",
            name="Trade-off",
            showlegend=False
        ),
        row=2, col=2
    )
    fig.update_layout(
        title_text="Transformer Models Performance",
        title_x=0.5,
        height=800,
        showlegend=False
    )
    fig.update_xaxes(title_text="Models", row=1, col=1)
    fig.update_xaxes(title_text="Models", row=1, col=2)
    fig.update_xaxes(title_text="Models", row=2, col=1)
    fig.update_xaxes(title_text="Time (ms)", row=2, col=2)
    fig.update_yaxes(title_text="Time (ms)", row=1, col=1)
    fig.update_yaxes(title_text="Memory (MB)", row=1, col=2)
    fig.update_yaxes(title_text="Parameters", row=2, col=1)
    fig.update_yaxes(title_text="Parameters", row=2, col=2)
    fig.show()
    return fig
performance_fig = create_performance_charts(performance_df)
Summary¶
We compared attention mechanisms across three Transformer architectures and found distinct patterns for each.
1. RoBERTa
2. DistilBERT
3. BERT
When to Use Each Model¶
- High performance tasks → RoBERTa
- Resource constraints → DistilBERT
- General purpose/exploration → BERT
Model Signatures¶
| Model | CLS Agg. | Self-Att. | Ratio C/F | Entropy | Speed |
|---|---|---|---|---|---|
| BERT | 1.00x | ||||
| DistilBERT | |||||
| RoBERTa |
import os

# Persist all analysis artefacts as CSV files under results/.
os.makedirs("results", exist_ok=True)

# 1. Sentences from the dataset
sentences_df = pd.DataFrame({
    "Sentence_ID": [f"S{idx + 1}" for idx in range(len(TEST_SENTENCES))],
    "Domain": SENTENCE_DOMAINS,
    "Sentence": TEST_SENTENCES,
    "Word_Count": [len(text.split()) for text in TEST_SENTENCES],
})
sentences_df.to_csv("results/dataset_sentences.csv", index=False)

# 2. Metrics by domain
domain_pivot.to_csv("results/metrics_by_domain.csv")

# 3. Metrics by sentence — one flat row per (sentence, model).
# CSV column name -> key inside each all_patterns[sentence][model] dict.
_METRIC_COLUMNS = {
    "CLS_Attention_Max": "cls_attention_max",
    "CLS_Attention_Mean": "cls_attention_mean",
    "Self_Attention_Mean": "self_attention_mean",
    "Content_Function_Ratio": "content_vs_function",
    "Attention_Entropy": "attention_entropy",
}
sentence_metrics = [
    {
        "Sentence_ID": f"S{idx + 1}",
        "Domain": domain,
        "Model": model,
        **{column: all_patterns[key][model][metric]
           for column, metric in _METRIC_COLUMNS.items()},
    }
    for idx, (key, domain) in enumerate(zip(all_patterns.keys(), SENTENCE_DOMAINS))
    for model in models_data.keys()
]
sentence_metrics_df = pd.DataFrame(sentence_metrics)
sentence_metrics_df.to_csv("results/metrics_by_sentence.csv", index=False)